Set up Functions:
# Arguments:
# alpha = number of successes + 1
# beta = number of failures + 1
# dist_fun = distribution which revenue follows
# params = parameters for given input distribution
# MSamples = Number of samples to draw from prior distribution idealy +inf
# Assumptions:
# payer ~ Bernoulli(lambda)
# lambda ~ Beta(alpha, beta)
# revenue ~ D(params) <- specified as input
# ARPU = E(payer)*E(revenue) = lambda*1/omega
# all variables are calculated for A and B group
# Return value:
# probBbeatsA = probability that the B version has higher ARPU than the A version
# expLossA = expected loss on condition that we select A but B is better
# expLossB = expected loss on condition that we select B but A is better
# Implemented based on:
# VWO white paper: https://cdn2.hubspot.net/hubfs/310840/VWO_SmartStats_technical_whitepaper.pdf
# Pixel Federation Bayesian AB Testing: https://portal.pixelfederation.com/en/blog/article/ab-testing-methodology-change
bayes_arpu = function(
alphaA,
betaA,
paramsA,
dist_funA,
alphaB,
betaB,
paramsB,
dist_funB,
MSamples
) {
lambdaA = rbeta(MSamples, alphaA, betaA)
lambdaB = rbeta(MSamples, alphaB, betaB)
if (length(paramsA) == 1) { omegaA = dist_funA(MSamples, paramsA[1]) }
else { omegaA = dist_funA(MSamples, paramsA[1], paramsA[2]) }
if (length(paramsB) == 1) { omegaB = dist_funB(MSamples, paramsB[1]) }
else { omegaB = dist_funB(MSamples, paramsB[1], paramsB[2]) }
convProbBbeatsA = sum(lambdaB > lambdaA)/MSamples
diffTemp = lambdaB - lambdaA
convExpLossA = sum(diffTemp*(diffTemp > 0))/MSamples
convExpLossB = sum(-diffTemp*(-diffTemp > 0))/MSamples
revProbBbeatsA = sum(1/omegaB > 1/omegaA)/MSamples
diffTemp = 1/omegaB - 1/omegaA
revExpLossA = sum(diffTemp*(diffTemp > 0))/MSamples
revExpLossB = sum(-diffTemp*(-diffTemp > 0))/MSamples
arpuProbBbeatsA = sum(lambdaB/omegaB > lambdaA/omegaA)/MSamples
diffTemp = lambdaB/omegaB - lambdaA/omegaA
arpuExpLossA = sum(diffTemp*(diffTemp > 0))/MSamples
arpuExpLossB = sum(-diffTemp*(-diffTemp > 0))/MSamples
list(
convProbBbeatsA = convProbBbeatsA,
convExpLossA = convExpLossA,
convExpLossB = convExpLossB,
revProbBbeatsA = revProbBbeatsA,
revExpLossA = revExpLossA,
revExpLossB = revExpLossB,
arpuProbBbeatsA = arpuProbBbeatsA,
arpuExpLossA = arpuExpLossA,
arpuExpLossB = arpuExpLossB,
sampleLambdaA = lambdaA,
sampleLambdaB = lambdaB,
sampleOmegaA = omegaA,
sampleOmegaB = omegaB
)
}
# Calculates the HDI interval from a sample of representative values
# estimated as shortest credible interval
# Arguments:
# sampleVec is a vector of representative values from a probability
# distribution.
# credMass is a scalar between 0 and 1, indicating the mass within
# the credible interval that is to be estimated.
# Return value:
# Highest density iterval (HDI) limits in a vector
hdi_of_sample = function(
sampleVec, credMass = 0.95
) {
sortedPts <- sort(sampleVec)
sortedPtsLength <- length(sortedPts)
if(sortedPtsLength >= 3) {
ciIdxInc <- min(ceiling(credMass*sortedPtsLength), sortedPtsLength - 1)
nCIs <- sortedPtsLength - ciIdxInc
ciWidth <- rep(0, nCIs)
for (i in 1:nCIs) {
ciWidth[i] <- sortedPts[i + ciIdxInc] - sortedPts[i]
}
HDImin <- sortedPts[which.min(ciWidth)]
HDImax <- sortedPts[which.min(ciWidth) + ciIdxInc]
HDIlim <- c(HDImin, HDImax)
} else {
HDIlim <- c(min(sortedPts), max(sortedPts))
}
return(HDIlim)
}
# Creates plotly chart of approximate density based on a sample data
plot_sample_density = function(
sampleData, hdi = c(-0.1, 0.1)
) {
dens = density(sampleData, bw = 'nrd')
plotData = data.frame(x = dens$x, y = dens$y)
x = plotData$x
y = plotData$y
xArea = plotData[x >= hdi[1] & x <= hdi[2], ]$x
yArea = plotData[x >= hdi[1] & x <= hdi[2], ]$y
plot_ly(
x = x,
y = y,
type = 'scatter',
mode = 'lines',
name = 'ARPU B - A',
line = list(color = '#F27B0C'),
hoverinfo = 'none',
text = ~paste(sprintf('%.3g%%', x*100)),
showlegend = FALSE
) %>% add_trace(
x = xArea,
y = yArea,
type = 'scatter',
mode = 'lines',
fill = 'tozeroy',
fillcolor = 'rgba(242, 123, 12, 0.2)',
line = list(color = '#F27B0C'),
hoverinfo = 'none',
text = ~paste(sprintf('%.3g%%', xArea*100)),
showlegend = FALSE
) %>% layout(
title = 'Approximate distribution of difference of ARPU B - A',
title = list(font = 14),
xaxis = list(showgrid = FALSE, fixedrange = TRUE),
yaxis = list(showline = FALSE, showticklabels = FALSE, showgrid = FALSE, fixedrange = TRUE),
legend = list(x = 0.9, y = 0.9)
) %>% config(displayModeBar = FALSE)
}
# Creates plotly chart of approximate densities based on a sample data
plot_sample_densities = function(
sampleDataA, hdiA = c(-0.1, 0.1),
sampleDataB, hdiB = c(-0.1, 0.1),
nameA, nameB
) {
densA = density(sampleDataA, bw = 'nrd')
densB = density(sampleDataB, bw = 'nrd')
plotDataA = data.frame(x = densA$x, y = densA$y)
plotDataB = data.frame(x = densB$x, y = densB$y)
xA = plotDataA$x
yA = plotDataA$y
xB = plotDataB$x
yB = plotDataB$y
xAreaA = plotDataA[xA >= hdiA[1] & xA <= hdiA[2], ]$x
yAreaA = plotDataA[xA >= hdiA[1] & xA <= hdiA[2], ]$y
xAreaB = plotDataB[xB >= hdiB[1] & xB <= hdiB[2], ]$x
yAreaB = plotDataB[xB >= hdiB[1] & xB <= hdiB[2], ]$y
plot_ly(
x = xA,
y = yA,
type = 'scatter',
mode = 'lines',
name = nameA,
line = list(color = '#5BC0DE'),
hoverinfo = 'none'
) %>% add_trace(
x = xAreaA,
y = yAreaA,
type = 'scatter',
mode = 'lines',
fill = 'tozeroy',
fillcolor = 'rgba(91, 192, 222, 0.2)',
line = list(color = '#5BC0DE'),
hoverinfo = 'none',
showlegend = FALSE
) %>% add_trace(
x = xB,
y = yB,
type = 'scatter',
mode = 'lines',
name = nameB,
line = list(color = '#5CB85C'),
hoverinfo = 'none'
) %>% add_trace(
x = xAreaB,
y = yAreaB,
type = 'scatter',
mode = 'lines',
fill = 'tozeroy',
fillcolor = 'rgba(92, 184, 92, 0.2)',
line = list(color = '#5CB85C'),
hoverinfo = 'none',
showlegend = FALSE
) %>% layout(
title = 'Distribution of ARPU A and B',
title = list(font = 14),
xaxis = list(showgrid = FALSE, fixedrange = TRUE),
yaxis = list(showline = FALSE, showticklabels = FALSE, showgrid = FALSE, fixedrange = TRUE),
legend = list(x = 0.9, y = 0.9)
) %>% config(displayModeBar = FALSE)
}
Running an example
n_trial_a = nrow(df_all[df_all$test_group == 'C', ])
n_trial_b = nrow(df_all[df_all$test_group == 'P', ])
n_success_a = nrow(df_payers[df_payers$test_group == 'C', ])
n_success_b = nrow(df_payers[df_payers$test_group == 'P', ])
revenue_a = sum(df_all[df_all$test_group == 'C', ]$total_wins_spend)
revenue_b = sum(df_all[df_all$test_group == 'P', ]$total_wins_spend)
name_A = 'Control'
name_B = 'Personalised'
paramsA = get_dist(df_payers[df_payers$test_group == 'C', ]$total_wins_spend, distributions, distribution_name, F, F)
paramsB = get_dist(df_payers[df_payers$test_group == 'P', ]$total_wins_spend, distributions, distribution_name, F, F)
res = bayes_arpu(
alphaA = n_success_a+1,
betaA = n_trial_a-n_success_a+1,
paramsA = paramsA$param_est,
dist_funA = paramsA$model_used,
alphaB = n_success_b+1,
betaB = n_trial_a-n_success_a+1,
paramsB = paramsB$param_est,
dist_funB = paramsB$model_used,
MSamples = 100
)
post_sample_A = res$sampleLambdaA/res$sampleOmegaA
post_sample_B = res$sampleLambdaB/res$sampleOmegaB
diff_post_sample = post_sample_B - post_sample_A
hdi_A = hdi_of_sample(post_sample_A)
hdi_B = hdi_of_sample(post_sample_B)
hdi_diff = hdi_of_sample(diff_post_sample)
x_lim = {
a = min(hdi_A, hdi_B)
b = max(hdi_A, hdi_B)
c(1.2*a-0.2*b, 1.2*b-0.2*a)
}
x_lim_diff = {
a = hdi_diff[1]
b = hdi_diff[2]
c(1.2*a-0.2*b, 1.2*b-0.2*a)
}
plot_sample_density(
sample = diff_post_sample,
hdi = hdi_diff
)
plot_sample_densities(
sampleDataA = post_sample_A,
sampleDataB = post_sample_B,
hdiA = hdi_A,
hdiB = hdi_B,
name_A,
name_B
)
tab = data.frame(matrix(
c(n_trial_a,
sprintf('%.3g%%', n_success_a/n_trial_a*100),
sprintf('%.4g€', revenue_a/n_trial_a),
sprintf('%.4g€', revenue_a/n_success_a),
sprintf('[%.3g€, %.3g€]', hdi_A[1], hdi_A[2]),
n_trial_b,
sprintf('%.3g%%', n_success_b/n_trial_b*100),
sprintf('%.4g€', revenue_b/n_trial_b),
sprintf('%.4g€', revenue_b/n_success_b),
sprintf('[%.3g€, %.3g€]', hdi_A[1], hdi_A[2]),
sprintf('N/A'),
sprintf('%.3g%%', n_success_b/n_trial_b*100 - n_success_a/n_trial_a*100),
sprintf('%.4g€', revenue_b/n_trial_b - revenue_a/n_trial_a),
sprintf('%.4g€', revenue_b/n_success_b - revenue_a/n_success_a),
sprintf('[%.3g€, %.3g€]', hdi_diff[1], hdi_diff[2])
),
nrow = 5, ncol = 3, byrow = F,
dimnames = list(c('sample size', 'conversion', 'ARPU', 'ARPPU', '95% HDI'),
c(name_A, name_B, paste(name_B, '-', name_A)))
)
)
tab
tab2 <- data.frame(matrix(
c(sprintf('%.1f%%', res$convProbBbeatsA*100),
sprintf('%.2g%%', res$convExpLossA*100),
sprintf('%.2g%%', res$convExpLossB*100),
sprintf('%.1f%%', res$revProbBbeatsA*100),
sprintf('%.2g €', res$revExpLossA),
sprintf('%.2g €', res$revExpLossB),
sprintf('%.1f%%', res$arpuProbBbeatsA*100),
sprintf('%.2g €', res$arpuExpLossA),
sprintf('%.2g €', res$arpuExpLossB)
),
nrow = 3, ncol = 3, byrow = F,
dimnames = list(c('P(B > A)', 'E(Uplift | B > A)', 'E(Downlift | B < A)'),
c('conversion', 'ARPPU', 'ARPU'))
)
)
tab2